#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test scheduling of gres/gpu and gres/mps
#
# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
#          "FAILURE: ..." otherwise with an explanation of the failure, OR
#          anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2018 SchedMD LLC
# Written by Morris Jette
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
############################################################################
source ./globals

# Test identity, overall result code and the id of the most recent batch job
set test_id     "40.2"
set exit_code   0
set job_id      0

# Scratch files: two generated scripts and two job output files
set file_in1   "test${test_id}.input1"
set file_in2   "test${test_id}.input2"
set file_out1  "test${test_id}.output1"
set file_out2  "test${test_id}.output2"

print_header $test_id

# Skip the whole test (exit 0) unless select/cons_tres is configured
if {![test_cons_tres]} {
	send_user "\nWARNING: This test is only compatible with select/cons_tres\n"
	exit 0
}
send_user "\nValid configuration, using select/cons_tres\n"

set def_part_name [default_partition]
set nb_nodes [get_node_cnt_in_part $def_part_name]

# Classify the value returned by get_mps_count:
#   negative   -> hard failure, abort
#   zero       -> nothing to test, skip
#   below 100  -> flag a failure but keep running the tests
set mps_cnt [get_mps_count 1]
if {$mps_cnt < 0} {
	send_user "\nFAILURE: Error getting MPS count\n"
	exit 1
} elseif {$mps_cnt < 1} {
	send_user "\nWARNING: This test requires 1 or more MPS in the default partition\n"
	exit 0
} elseif {$mps_cnt < 100} {
	# This could be a legitimate configuration for testing purposes,
	# but it is not likely for production use
	send_user "\nFAILURE: MPS count per node is unexpectedly low ($mps_cnt < 100). Check your configuration\n"
	set exit_code 1
}
send_user "\nMPS count is $mps_cnt\n"

# All request sizes below are derived from mps_cnt, capped at 100
if {$mps_cnt > 100} {
	set mps_cnt 100
}

#
# Simple MPS request, check environment variables
#
send_user "\n\n==== TEST 1 ====\n"

# The task script prints one line per value so each can be matched on its own
make_bash_script $file_in1 "
echo HOST:\$SLURMD_NODENAME
echo CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES
echo CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
exit 0"

# Request half of the (capped) MPS count for a single task
set target_job  [expr {$mps_cnt / 2}]
set timeout     $max_job_delay
set matches     0
set host        "unknown"
set device      -1
set percentage  -1
set srun_pid [spawn $srun --gres=mps:$target_job -n1 -t1 -J "test$test_id" $file_in1]
expect {
	-re "HOST:($alpha_numeric_under)" {
		set host $expect_out(1,string)
		incr matches
		exp_continue
	}
	-re "CUDA_VISIBLE_DEVICES:($number)" {
		incr matches
		# The reported device number is required to be 0
		if {$expect_out(1,string) != 0} {
			send_user "\nFAILURE: CUDA_VISIBLE_DEVICES != 0\n"
			set exit_code 1
		}
		exp_continue
	}
	-re "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
		incr matches
		exp_continue
	}
	eof {
		wait
	}
	timeout {
		send_user "\nFAILURE: srun not responding\n"
		slow_kill $srun_pid
		set exit_code 1
	}
}

# Exactly one match is expected for each of the three echoed lines
if {$matches != 3} {
	send_user "\nFAILURE: output is bad ($matches != 3)\n"
	set exit_code 1
}

#
# Run two steps in parallel to consume gres/mps using sbatch
#
# The batch script first runs one step that requests the job's full gres/mps
# allocation (target_job), then two concurrent steps that each request half
# of it (target_step). Three percentage lines are expected in the output.
#
send_user "\n\n==== TEST 2 ====\n"
set target_job  [expr $mps_cnt / 2]
set target_step [expr $target_job / 2]
make_bash_script $file_in1 "
echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
$bin_sleep 5
exit 0"
make_bash_script $file_in2 "
$srun --mem=0 --gres=mps:$target_job  $file_in1 &
wait
$bin_date
$srun --mem=0 --gres=mps:$target_step $file_in1 &
$srun --mem=0 --gres=mps:$target_step $file_in1 &
wait
$bin_date
exit 0"
exec $bin_rm -f $file_out1
set job_id 0
set sbatch_pid [spawn $sbatch --gres=mps:$target_job -n1 -t1 -o $file_out1 -J "test$test_id" $file_in2]
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sbatch not responding\n"
		# Kill the hung sbatch, as the srun/salloc timeout branches do
		slow_kill $sbatch_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	# Without a job id there is nothing to wait for and no output file will
	# ever appear, so stop here instead of polling with job id 0
	send_user "\nFAILURE: job not submitted\n"
	exit 1
}
if {[wait_for_job $job_id "DONE"] != 0} {
	send_user "\nFAILURE: job $job_id did not complete\n"
	cancel_job $job_id
	set exit_code 1
}
if {[wait_for_file $file_out1] != 0} {
	send_user "\nFAILURE: no output file\n"
	exit 1
}

# Parse the three step reports from the job output:
#   match 1 -> perc      percentage of the full-size step
#   match 2 -> perc_sum  percentage of the first half-size step
#   match 3 -> must equal match 2, and perc minus both half-size
#              percentages must not exceed 1
set match 0
spawn $bin_cat $file_out1
expect {
	-re "CUDA_VISIBLE_DEVICES:($number) CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
		if {$expect_out(1,string) != 0} {
			send_user "\nFAILURE: CUDA_VISIBLE_DEVICES != 0\n"
			set exit_code 1
		}
		incr match
		if {$match == 1} {
			set perc $expect_out(2,string)
		} elseif {$match == 2} {
			set perc_sum $expect_out(2,string)
		} elseif {$perc_sum != $expect_out(2,string)} {
			send_user "\nFAILURE: bad CUDA percentage ($perc_sum != $expect_out(2,string))\n"
			set exit_code 1
		} elseif {$perc - $perc_sum - $expect_out(2,string) > 1} {
			send_user "\nFAILURE: bad CUDA percentage ($perc != ($perc_sum + $expect_out(2,string)))\n"
			set exit_code 1
		}
		exp_continue
	}
	eof {
		wait
	}
}
if {$match != 3} {
	send_user "\nFAILURE: bad CUDA information about job ($match != 3)\n"
	set exit_code 1
}

#
# Run two steps in parallel to consume gres/mps using salloc
# Reuse scripts from test 2 above
#
# This relies on target_job and on the $file_in1 / $file_in2 scripts still
# holding what the sbatch variant of this test left in them. The step output
# is read directly from salloc and checked the same way as there:
#   match 1 -> perc      percentage of the full-size step
#   match 2 -> perc_sum  percentage of the first half-size step
#   match 3 -> must equal match 2, and perc minus both half-size
#              percentages must not exceed 1
#
send_user "\n\n==== TEST 3 ====\n"
set match 0
set salloc_pid [spawn $salloc --gres=mps:$target_job -n1 -t1 -J "test$test_id" ./$file_in2]
expect {
	-re "CUDA_VISIBLE_DEVICES:($number) CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
		if {$expect_out(1,string) != 0} {
			send_user "\nFAILURE: CUDA_VISIBLE_DEVICES != 0\n"
			set exit_code 1
		}
		incr match
		if {$match == 1} {
			set perc $expect_out(2,string)
		} elseif {$match == 2} {
			set perc_sum $expect_out(2,string)
		} elseif {$perc_sum != $expect_out(2,string)} {
			send_user "\nFAILURE: bad CUDA percentage ($perc_sum != $expect_out(2,string))\n"
			set exit_code 1
		} elseif {$perc - $perc_sum - $expect_out(2,string) > 1} {
			send_user "\nFAILURE: bad CUDA percentage ($perc != ($perc_sum + $expect_out(2,string)))\n"
			set exit_code 1
		}
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: salloc not responding\n"
		slow_kill $salloc_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$match != 3} {
	send_user "\nFAILURE: bad CUDA information about job ($match != 3)\n"
	set exit_code 1
}

#
# Run three steps in parallel to make sure steps get delayed as needed to
# avoid oversubscribing consumed MPS resources
#
# The job is allocated target_job gres/mps and each of the three steps asks
# for half of that (target_step), so the three requests together exceed the
# job's allocation.
#
send_user "\n\n==== TEST 4 ====\n"
set target_job  [expr $mps_cnt / 2]
set target_step [expr $target_job / 2]
make_bash_script $file_in1 "
echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
$bin_sleep 5
exit 0"
make_bash_script $file_in2 "
$srun --mem=0 --gres=mps:$target_step $file_in1 &
$srun --mem=0 --gres=mps:$target_step $file_in1 &
$srun --mem=0 --gres=mps:$target_step $file_in1 &
wait
$bin_date
exit 0"
exec $bin_rm -f $file_out1
set job_id 0
set sbatch_pid [spawn $sbatch --gres=mps:$target_job -n1 -t1 -o $file_out1 -J "test$test_id" $file_in2]
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sbatch not responding\n"
		# Kill the hung sbatch, as the srun/salloc timeout branches do
		slow_kill $sbatch_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	# Without a job id there is nothing to wait for and no output file will
	# ever appear, so stop here instead of polling with job id 0
	send_user "\nFAILURE: job not submitted\n"
	exit 1
}
if {[wait_for_job $job_id "DONE"] != 0} {
	send_user "\nFAILURE: job $job_id did not complete\n"
	cancel_job $job_id
	set exit_code 1
}
if {[wait_for_file $file_out1] != 0} {
	send_user "\nFAILURE: no output file\n"
	exit 1
}

# First pass over the output: all three steps must report the same percentage
set match 0
spawn $bin_cat $file_out1
expect {
	-re "CUDA_VISIBLE_DEVICES:($number) CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
		if {$expect_out(1,string) != 0} {
			send_user "\nFAILURE: CUDA_VISIBLE_DEVICES != 0\n"
			set exit_code 1
		}
		incr match
		if {$match == 1} {
			set perc $expect_out(2,string)
		} elseif {$perc != $expect_out(2,string)} {
			send_user "\nFAILURE: inconsistent step MPS percentages ($perc != $expect_out(2,string))\n"
			set exit_code 1
		}
		exp_continue
	}
	eof {
		wait
	}
}
if {$match != 3} {
	send_user "\nFAILURE: bad CUDA information about job ($match != 3)\n"
	set exit_code 1
}

# Second pass (echo suppressed): exactly one "step creation temporarily
# disabled" message must be present in the job output
log_user 0
set match 0
spawn $bin_cat $file_out1
expect {
	-re "step creation temporarily disabled" {
		incr match
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1
if {$match != 1} {
	send_user "\nFAILURE: failed to delay step for sufficient MPS resources ($match != 1)\n"
	set exit_code 1
}

#
# Run step to try to consume more gres/mps than allocated to the job
#
# The step asks for target_job + 1 while the job itself is only allocated
# target_job. The output must contain "Unable to create step" exactly once
# and no CUDA_VISIBLE_DEVICES line.
#
send_user "\n\n==== TEST 5 ====\n"
set target_job  [expr $mps_cnt / 2]
set target_step [expr $target_job + 1]
make_bash_script $file_in1 "
echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
$bin_sleep 5
exit 0"
make_bash_script $file_in2 "
$srun --mem=0 --gres=mps:$target_step $file_in1
exit 0"
exec $bin_rm -f $file_out1
set job_id 0
set sbatch_pid [spawn $sbatch --gres=mps:$target_job -n1 -t1 -o $file_out1 -J "test$test_id" $file_in2]
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sbatch not responding\n"
		# Kill the hung sbatch, as the srun/salloc timeout branches do
		slow_kill $sbatch_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	# Without a job id there is nothing to wait for and no output file will
	# ever appear, so stop here instead of polling with job id 0
	send_user "\nFAILURE: job not submitted\n"
	exit 1
}
if {[wait_for_job $job_id "DONE"] != 0} {
	send_user "\nFAILURE: job $job_id did not complete\n"
	cancel_job $job_id
	set exit_code 1
}
if {[wait_for_file $file_out1] != 0} {
	send_user "\nFAILURE: no output file\n"
	exit 1
}
set match 0
spawn $bin_cat $file_out1
expect {
	-re "Unable to create step" {
		send_user "\nError is expected, no worries.\n"
		incr match
		exp_continue
	}
	-re "CUDA_VISIBLE_DEVICES" {
		# Output from the task script means the step was created
		send_user "\nFAILURE: failed to reject bad step ($match != 1)\n"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$match != 1} {
	send_user "\nFAILURE: failed to reject bad step ($match != 1)\n"
	set exit_code 1
}

#
# Run multi-node job
#
# Uses at most two of the nodes counted by get_mps_node_count and expects
# one HOST line per node. Outside front-end mode, a later HOST line naming
# the same node as the first one is a failure.
#
send_user "\n\n==== TEST 6 ====\n"
set node_cnt [get_mps_node_count]
if {$node_cnt > 2} {
	set node_cnt 2
}
set target_job  [expr $mps_cnt / 2]
make_bash_script $file_in1 "
echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
$bin_sleep 5
exit 0"
set match 0
set hostname "NO_VAL"
set srun_pid [spawn $srun --gres=mps:$target_job -N$node_cnt -t1 -J "test$test_id" $file_in1]
expect {
	-re "HOST:($alpha_numeric_under)" {
		incr match
		if {$match == 1} {
			# Remember the first node name to compare later ones against
			set hostname $expect_out(1,string)
		} elseif {[test_front_end]} {
			send_user "\nDuplicate SLURMD_NODENAME in front-end mode as expected\n"
		} elseif {[string compare $hostname $expect_out(1,string)] == 0} {
			send_user "\nFAILURE: Two tasks ran on same node ($hostname)\n"
			set exit_code 1
		}
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun not responding\n"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$match != $node_cnt} {
	send_user "\nFAILURE: failed to get data from multiple nodes ($match != $node_cnt)\n"
	set exit_code 1
}

#
# Make sure that gres/gpu and gres/mps jobs either do not share the same GPU
# or run at different times
#
# Job 1 requests gres/gpu, prints its own "scontrol -dd show job" record,
# submits job 2 (gres/mps) onto the same node and then sleeps. Job 2 prints
# its own job record plus the running jobs named after this test. The GPU
# index of job 1 (dev1) is then compared with the one of job 2 (dev2).
# Skipped in front-end mode.
#
if {[test_front_end] == 0} {
	send_user "\n\n==== TEST 7 ====\n"
	set target_job [expr $mps_cnt / 2]
	make_bash_script $file_in1 "
	echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
	$scontrol -dd show job \$SLURM_JOB_ID
	$sbatch --gres=mps:$target_job -w \$SLURMD_NODENAME -n1 -t1 -o $file_out2 -J test$test_id $file_in2
	sleep 30
	exit 0"

	make_bash_script $file_in2 "
	echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
	$scontrol -dd show job \$SLURM_JOB_ID
	$squeue --name=test$test_id --noheader --state=r --format=\"jobid=%i state=%T\"
	exit 0"

	exec $bin_rm -f $file_out1 $file_out2
	set job_id1 0
	set sbatch_pid [spawn $sbatch --gres=gpu:1 -n1 -t1 -o $file_out1 -J "test$test_id" $file_in1]
	expect {
		-re "Submitted batch job ($number)" {
			set job_id1 $expect_out(1,string)
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: sbatch not responding\n"
			# Kill the hung sbatch, as the srun/salloc timeout branches do
			slow_kill $sbatch_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$job_id1 == 0} {
		# Nothing to wait for and no output file will ever appear
		send_user "\nFAILURE: job not submitted\n"
		exit 1
	}
	sleep 30
	if {[wait_for_job $job_id1 "DONE"] != 0} {
		send_user "\nFAILURE: job $job_id1 did not complete\n"
		cancel_job $job_id1
		set exit_code 1
	}
	if {[wait_for_file $file_out1] != 0} {
		send_user "\nFAILURE: no output file\n"
		exit 1
	}

	# From job 1's output collect: proof that CUDA_VISIBLE_DEVICES was set,
	# the GPU index from its job record, and the id of the job it submitted
	set dev1 -1
	set job_id2 0
	set match 0
	spawn $bin_cat $file_out1
	expect {
		-re "CUDA_VISIBLE_DEVICES:($number)" {
			incr match
			exp_continue
		}
		-re "gpu.IDX:($number)" {
			set dev1 $expect_out(1,string)
			exp_continue
		}
		-re "Submitted batch job ($number)" {
			set job_id2 $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	if {$match == 0} {
		send_user "\nFAILURE: CUDA_VISIBLE_DEVICES not set\n"
		set exit_code 1
	}
	if {$dev1 == -1} {
		send_user "\nFAILURE: GPU device index not found\n"
		set exit_code 1
	}
	if {$job_id2 == 0} {
		# Without the second job there is nothing left to compare
		send_user "\nFAILURE: Failed to submit second job\n"
		exit 1
	}

	send_user "\n\n"
	if {[wait_for_job $job_id2 "DONE"] != 0} {
		send_user "\nFAILURE: job $job_id2 did not complete\n"
		cancel_job $job_id2
		set exit_code 1
	}
	if {[wait_for_file $file_out2] != 0} {
		send_user "\nFAILURE: no output file\n"
		exit 1
	}

	# From job 2's output collect its GPU index and whether job 1 was still
	# listed as RUNNING while job 2 executed
	set dev2 -1
	set match 0
	set running 0
	spawn $bin_cat $file_out2
	expect {
		-re "CUDA_VISIBLE_DEVICES:($number)" {
			incr match
			if {$match == 1} {
				# Fallback value, used only if no mps IDX is found below
				set dev2 $expect_out(1,string)
			}
			exp_continue
		}
		-re "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
			incr match
			exp_continue
		}
		-re "mps.IDX:($number)" {
			# Index from job 2's own job record. This belongs in dev2:
			# dev1 holds job 1's index and must stay intact for the
			# comparison below.
			set dev2 $expect_out(1,string)
			exp_continue
		}
		-re "jobid=$job_id1 state=RUNNING" {
			set running 1
			exp_continue
		}
		eof {
			wait
		}
	}
	if {$match != 2} {
		send_user "\nFAILURE: CUDA_VISIBLE_DEVICES or CUDA_MPS_ACTIVE_THREAD_PERCENTAGE not set\n"
		set exit_code 1
	}
	if {$dev2 == -1} {
		send_user "\nFAILURE: GPU device index not found\n"
		set exit_code 1
	} elseif {$dev1 == $dev2} {
		if {$running == 0} {
			send_user "\nThe jobs are using the same GPU ($dev1) and running at different times, which is fine\n"
		} else {
			send_user "\nFAILURE: The jobs are using the same GPU ($dev1) and running at the same time\n"
			set exit_code 1
		}
	} else {
		send_user "\nThe jobs are using different GPUs ($dev1 != $dev2), which is fine\n"
	}
}

#
# Clean up and exit
#
# On failure the generated scripts and output files are left in place;
# they are removed only when every check passed.
if {$exit_code != 0} {
	exit $exit_code
}
exec $bin_rm -f $file_in1 $file_in2 $file_out1 $file_out2
send_user "\nSUCCESS\n"
exit $exit_code
